{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Handling Imbalanced Stock Data with SMOTE and Near Miss Algorithm"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Synthetic Minority Oversampling Technique (SMOTE) is an oversampling method for imbalanced classification: it generates synthetic examples of the minority class so that predictive models can be trained on datasets with a severe class imbalance. NearMiss is an undersampling algorithm that balances a dataset by removing examples from the majority class. "
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import math\n",
        "\n",
        "import os\n",
        "import sys\n",
        "import platform\n",
        "\n",
        "import sklearn\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.metrics import confusion_matrix, classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "%matplotlib inline\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "import yfinance as yf\n",
        "yf.pdr_override()"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:50.708Z",
          "iopub.execute_input": "2022-11-05T02:52:50.713Z",
          "iopub.status.idle": "2022-11-05T02:52:51.240Z",
          "shell.execute_reply": "2022-11-05T02:52:51.284Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"numpy: \", np.__version__)\n",
        "print(\"pandas: \", pd.__version__)\n",
        "print(\"sklearn: \", sklearn.__version__)\n",
        "print(\"yfinance: \", yf.__version__)\n",
        "print(\"os system: \", os.name)\n",
        "print(\"Python Version:\", sys.version)\n",
        "print(\"Platform System: \", platform.system())"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "numpy:  1.19.5\n",
            "pandas:  1.1.5\n",
            "sklearn:  0.22.1\n",
            "yfinance:  0.1.77\n",
            "os system:  nt\n",
            "Python Version: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]\n",
            "Platform System:  Windows\n"
          ]
        }
      ],
      "execution_count": 2,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:51.246Z",
          "iopub.execute_input": "2022-11-05T02:52:51.250Z",
          "iopub.status.idle": "2022-11-05T02:52:51.260Z",
          "shell.execute_reply": "2022-11-05T02:52:51.288Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "stock = 'AAPL'\n",
        "start = '2016-01-01' \n",
        "end = '2022-01-01'\n",
        "dataset = yf.download(stock, start, end)\n",
        "dataset.head()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[*********************100%***********************]  1 of 1 completed\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": "                                Open       High        Low      Close  \\\nDate                                                                    \n2016-01-04 00:00:00-05:00  25.652500  26.342501  25.500000  26.337500   \n2016-01-05 00:00:00-05:00  26.437500  26.462500  25.602501  25.677500   \n2016-01-06 00:00:00-05:00  25.139999  25.592501  24.967501  25.174999   \n2016-01-07 00:00:00-05:00  24.670000  25.032499  24.107500  24.112499   \n2016-01-08 00:00:00-05:00  24.637501  24.777500  24.190001  24.240000   \n\n                           Adj Close     Volume  \nDate                                             \n2016-01-04 00:00:00-05:00  24.151493  270597600  \n2016-01-05 00:00:00-05:00  23.546274  223164000  \n2016-01-06 00:00:00-05:00  23.085480  273829600  \n2016-01-07 00:00:00-05:00  22.111166  324377600  \n2016-01-08 00:00:00-05:00  22.228085  283192000  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Open</th>\n      <th>High</th>\n      <th>Low</th>\n      <th>Close</th>\n      <th>Adj Close</th>\n      <th>Volume</th>\n    </tr>\n    <tr>\n      <th>Date</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2016-01-04 00:00:00-05:00</th>\n      <td>25.652500</td>\n      <td>26.342501</td>\n      <td>25.500000</td>\n      <td>26.337500</td>\n      <td>24.151493</td>\n      <td>270597600</td>\n    </tr>\n    <tr>\n      <th>2016-01-05 00:00:00-05:00</th>\n      <td>26.437500</td>\n      <td>26.462500</td>\n      <td>25.602501</td>\n      <td>25.677500</td>\n      <td>23.546274</td>\n      <td>223164000</td>\n    </tr>\n    <tr>\n      <th>2016-01-06 00:00:00-05:00</th>\n      <td>25.139999</td>\n      <td>25.592501</td>\n      <td>24.967501</td>\n      <td>25.174999</td>\n      <td>23.085480</td>\n      <td>273829600</td>\n    </tr>\n    <tr>\n      <th>2016-01-07 00:00:00-05:00</th>\n      <td>24.670000</td>\n      <td>25.032499</td>\n      <td>24.107500</td>\n      <td>24.112499</td>\n      <td>22.111166</td>\n      <td>324377600</td>\n    </tr>\n    <tr>\n      <th>2016-01-08 00:00:00-05:00</th>\n      <td>24.637501</td>\n      <td>24.777500</td>\n      <td>24.190001</td>\n      <td>24.240000</td>\n      <td>22.228085</td>\n      <td>283192000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 3,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:51.268Z",
          "iopub.execute_input": "2022-11-05T02:52:51.272Z",
          "iopub.status.idle": "2022-11-05T02:52:52.225Z",
          "shell.execute_reply": "2022-11-05T02:52:52.410Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)\n",
        "dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)\n",
        "dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)\n",
        "dataset['Returns'] = dataset['Adj Close'].pct_change()\n",
        "dataset['Returns_Buy_Sell'] = np.where(dataset['Returns'].shift(-1) > dataset['Returns'],1,0)\n",
        "dataset = dataset.dropna()"
      ],
      "outputs": [],
      "execution_count": 4,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.231Z",
          "iopub.execute_input": "2022-11-05T02:52:52.236Z",
          "iopub.status.idle": "2022-11-05T02:52:52.243Z",
          "shell.execute_reply": "2022-11-05T02:52:52.415Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = dataset.reset_index()\n",
        "dataset"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 5,
          "data": {
            "text/plain": "                          Date        Open        High         Low  \\\n0    2016-01-05 00:00:00-05:00   26.437500   26.462500   25.602501   \n1    2016-01-06 00:00:00-05:00   25.139999   25.592501   24.967501   \n2    2016-01-07 00:00:00-05:00   24.670000   25.032499   24.107500   \n3    2016-01-08 00:00:00-05:00   24.637501   24.777500   24.190001   \n4    2016-01-11 00:00:00-05:00   24.742500   24.764999   24.334999   \n...                        ...         ...         ...         ...   \n1505 2021-12-27 00:00:00-05:00  177.089996  180.419998  177.070007   \n1506 2021-12-28 00:00:00-05:00  180.160004  181.330002  178.529999   \n1507 2021-12-29 00:00:00-05:00  179.330002  180.630005  178.139999   \n1508 2021-12-30 00:00:00-05:00  179.470001  180.570007  178.089996   \n1509 2021-12-31 00:00:00-05:00  178.089996  179.229996  177.259995   \n\n           Close   Adj Close     Volume  Increase_Decrease  Buy_Sell_on_Open  \\\n0      25.677500   23.546274  223164000                  1                 0   \n1      25.174999   23.085480  273829600                  1                 0   \n2      24.112499   22.111166  324377600                  0                 0   \n3      24.240000   22.228085  283192000                  0                 1   \n4      24.632500   22.588015  198957600                  0                 1   \n...          ...         ...        ...                ...               ...   
\n1505  180.330002  179.586868   74919600                  1                 1   \n1506  179.289993  178.551132   79144300                  0                 0   \n1507  179.380005  178.640778   62348900                  0                 1   \n1508  178.199997  177.465637   59773000                  1                 0   \n1509  177.570007  176.838226   64062300                  0                 0   \n\n      Buy_Sell   Returns  Returns_Buy_Sell  \n0            0 -0.025059                 1  \n1            0 -0.019570                 0  \n2            1 -0.042205                 1  \n3            1  0.005288                 1  \n4            1  0.016193                 0  \n...        ...       ...               ...  \n1505         0  0.022975                 0  \n1506         1 -0.005767                 1  \n1507         0  0.000502                 0  \n1508         0 -0.006578                 1  \n1509         0 -0.003535                 0  \n\n[1510 rows x 12 columns]",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Date</th>\n      <th>Open</th>\n      <th>High</th>\n      <th>Low</th>\n      <th>Close</th>\n      <th>Adj Close</th>\n      <th>Volume</th>\n      <th>Increase_Decrease</th>\n      <th>Buy_Sell_on_Open</th>\n      <th>Buy_Sell</th>\n      <th>Returns</th>\n      <th>Returns_Buy_Sell</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2016-01-05 00:00:00-05:00</td>\n      <td>26.437500</td>\n      <td>26.462500</td>\n      <td>25.602501</td>\n      <td>25.677500</td>\n      <td>23.546274</td>\n      <td>223164000</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n      <td>-0.025059</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2016-01-06 00:00:00-05:00</td>\n      <td>25.139999</td>\n      <td>25.592501</td>\n      <td>24.967501</td>\n      <td>25.174999</td>\n      <td>23.085480</td>\n      <td>273829600</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n      <td>-0.019570</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2016-01-07 00:00:00-05:00</td>\n      <td>24.670000</td>\n      <td>25.032499</td>\n      <td>24.107500</td>\n      <td>24.112499</td>\n      <td>22.111166</td>\n      <td>324377600</td>\n      <td>0</td>\n      <td>0</td>\n      <td>1</td>\n      <td>-0.042205</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2016-01-08 00:00:00-05:00</td>\n      <td>24.637501</td>\n      <td>24.777500</td>\n      <td>24.190001</td>\n      <td>24.240000</td>\n      <td>22.228085</td>\n      <td>283192000</td>\n      <td>0</td>\n      <td>1</td>\n      
<td>1</td>\n      <td>0.005288</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2016-01-11 00:00:00-05:00</td>\n      <td>24.742500</td>\n      <td>24.764999</td>\n      <td>24.334999</td>\n      <td>24.632500</td>\n      <td>22.588015</td>\n      <td>198957600</td>\n      <td>0</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0.016193</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1505</th>\n      <td>2021-12-27 00:00:00-05:00</td>\n      <td>177.089996</td>\n      <td>180.419998</td>\n      <td>177.070007</td>\n      <td>180.330002</td>\n      <td>179.586868</td>\n      <td>74919600</td>\n      <td>1</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0.022975</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1506</th>\n      <td>2021-12-28 00:00:00-05:00</td>\n      <td>180.160004</td>\n      <td>181.330002</td>\n      <td>178.529999</td>\n      <td>179.289993</td>\n      <td>178.551132</td>\n      <td>79144300</td>\n      <td>0</td>\n      <td>0</td>\n      <td>1</td>\n      <td>-0.005767</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1507</th>\n      <td>2021-12-29 00:00:00-05:00</td>\n      <td>179.330002</td>\n      <td>180.630005</td>\n      <td>178.139999</td>\n      <td>179.380005</td>\n      <td>178.640778</td>\n      <td>62348900</td>\n      <td>0</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0.000502</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1508</th>\n      <td>2021-12-30 00:00:00-05:00</td>\n      <td>179.470001</td>\n      <td>180.570007</td>\n      <td>178.089996</td>\n      <td>178.199997</td>\n      <td>177.465637</td>\n      <td>59773000</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n      
<td>-0.006578</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1509</th>\n      <td>2021-12-31 00:00:00-05:00</td>\n      <td>178.089996</td>\n      <td>179.229996</td>\n      <td>177.259995</td>\n      <td>177.570007</td>\n      <td>176.838226</td>\n      <td>64062300</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>-0.003535</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n<p>1510 rows × 12 columns</p>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 5,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.249Z",
          "iopub.execute_input": "2022-11-05T02:52:52.254Z",
          "iopub.status.idle": "2022-11-05T02:52:52.264Z",
          "shell.execute_reply": "2022-11-05T02:52:52.426Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Show each column's dtype and non-null count to check for missing values.\n",
        "dataset.info()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 1510 entries, 0 to 1509\n",
            "Data columns (total 12 columns):\n",
            " #   Column             Non-Null Count  Dtype                           \n",
            "---  ------             --------------  -----                           \n",
            " 0   Date               1510 non-null   datetime64[ns, America/New_York]\n",
            " 1   Open               1510 non-null   float64                         \n",
            " 2   High               1510 non-null   float64                         \n",
            " 3   Low                1510 non-null   float64                         \n",
            " 4   Close              1510 non-null   float64                         \n",
            " 5   Adj Close          1510 non-null   float64                         \n",
            " 6   Volume             1510 non-null   int64                           \n",
            " 7   Increase_Decrease  1510 non-null   int32                           \n",
            " 8   Buy_Sell_on_Open   1510 non-null   int32                           \n",
            " 9   Buy_Sell           1510 non-null   int32                           \n",
            " 10  Returns            1510 non-null   float64                         \n",
            " 11  Returns_Buy_Sell   1510 non-null   int32                           \n",
            "dtypes: datetime64[ns, America/New_York](1), float64(6), int32(4), int64(1)\n",
            "memory usage: 118.1 KB\n"
          ]
        }
      ],
      "execution_count": 6,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.271Z",
          "iopub.execute_input": "2022-11-05T02:52:52.275Z",
          "iopub.status.idle": "2022-11-05T02:52:52.285Z",
          "shell.execute_reply": "2022-11-05T02:52:52.430Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Normalize the \"Buy_Sell\" column\n",
        "dataset['nearClose'] = StandardScaler().fit_transform(np.array(dataset['Adj Close']).reshape(-1, 1))\n",
        "\n",
        "# drop Date and Close columns as they are not relevant for prediction purpose \n",
        "dataset = dataset.drop(['Date', 'Close'], axis = 1)\n",
        "\n",
        "dataset['Buy_Sell'].value_counts()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": "1    817\n0    693\nName: Buy_Sell, dtype: int64"
          },
          "metadata": {}
        }
      ],
      "execution_count": 7,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.291Z",
          "iopub.execute_input": "2022-11-05T02:52:52.295Z",
          "iopub.status.idle": "2022-11-05T02:52:52.305Z",
          "shell.execute_reply": "2022-11-05T02:52:52.433Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "X = dataset.drop(['Returns_Buy_Sell'], axis=1)\n",
        "y = dataset['Returns_Buy_Sell']"
      ],
      "outputs": [],
      "execution_count": 8,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.311Z",
          "iopub.execute_input": "2022-11-05T02:52:52.316Z",
          "iopub.status.idle": "2022-11-05T02:52:52.324Z",
          "shell.execute_reply": "2022-11-05T02:52:52.436Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# split into 70:30 ratio\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)\n",
        "  \n",
        "# describes info about train and test set\n",
        "print(\"Number transactions X_train dataset: \", X_train.shape)\n",
        "print(\"Number transactions y_train dataset: \", y_train.shape)\n",
        "print(\"Number transactions X_test dataset: \", X_test.shape)\n",
        "print(\"Number transactions y_test dataset: \", y_test.shape)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number transactions X_train dataset:  (1057, 10)\n",
            "Number transactions y_train dataset:  (1057,)\n",
            "Number transactions X_test dataset:  (453, 10)\n",
            "Number transactions y_test dataset:  (453,)\n"
          ]
        }
      ],
      "execution_count": 9,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.329Z",
          "iopub.execute_input": "2022-11-05T02:52:52.334Z",
          "iopub.status.idle": "2022-11-05T02:52:52.343Z",
          "shell.execute_reply": "2022-11-05T02:52:52.439Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# logistic regression object\n",
        "lr = LogisticRegression()\n",
        "  \n",
        "# train the model on train set\n",
        "lr.fit(X_train, y_train.ravel())\n",
        "  \n",
        "predictions = lr.predict(X_test)\n",
        "  \n",
        "# print classification report\n",
        "print(classification_report(y_test, predictions))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.49      1.00      0.66       222\n",
            "           1       0.00      0.00      0.00       231\n",
            "\n",
            "    accuracy                           0.49       453\n",
            "   macro avg       0.25      0.50      0.33       453\n",
            "weighted avg       0.24      0.49      0.32       453\n",
            "\n"
          ]
        }
      ],
      "execution_count": 10,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.349Z",
          "iopub.execute_input": "2022-11-05T02:52:52.354Z",
          "shell.execute_reply": "2022-11-05T02:52:52.441Z",
          "iopub.status.idle": "2022-11-05T02:52:52.363Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import sklearn.neighbors._base\n",
        "sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base\n",
        "\n",
        "print(\"Before OverSampling, counts of label '1': {}\".format(sum(y_train == 1)))\n",
        "print(\"Before OverSampling, counts of label '0': {} \\n\".format(sum(y_train == 0)))\n",
        "  \n",
        "# import SMOTE module from imblearn library\n",
        "# pip install imblearn (if you don't have imblearn in your system)\n",
        "from imblearn.over_sampling import SMOTE\n",
        "sm = SMOTE(random_state = 2)\n",
        "X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())\n",
        "  \n",
        "print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))\n",
        "print('After OverSampling, the shape of train_y: {} \\n'.format(y_train_res.shape))\n",
        "  \n",
        "print(\"After OverSampling, counts of label '1': {}\".format(sum(y_train_res == 1)))\n",
        "print(\"After OverSampling, counts of label '0': {}\".format(sum(y_train_res == 0)))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Before OverSampling, counts of label '1': 517\n",
            "Before OverSampling, counts of label '0': 540 \n",
            "\n",
            "After OverSampling, the shape of train_X: (1080, 10)\n",
            "After OverSampling, the shape of train_y: (1080,) \n",
            "\n",
            "After OverSampling, counts of label '1': 540\n",
            "After OverSampling, counts of label '0': 540\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Using TensorFlow backend.\n"
          ]
        }
      ],
      "execution_count": 11,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:52.370Z",
          "iopub.execute_input": "2022-11-05T02:52:52.375Z",
          "iopub.status.idle": "2022-11-05T02:52:53.469Z",
          "shell.execute_reply": "2022-11-05T02:52:53.549Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "lr1 = LogisticRegression()\n",
        "lr1.fit(X_train_res, y_train_res.ravel())\n",
        "predictions = lr1.predict(X_test)\n",
        "  \n",
        "# print classification report\n",
        "print(classification_report(y_test, predictions))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.00      0.00      0.00       222\n",
            "           1       0.51      1.00      0.68       231\n",
            "\n",
            "    accuracy                           0.51       453\n",
            "   macro avg       0.25      0.50      0.34       453\n",
            "weighted avg       0.26      0.51      0.34       453\n",
            "\n"
          ]
        }
      ],
      "execution_count": 12,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:53.476Z",
          "iopub.execute_input": "2022-11-05T02:52:53.480Z",
          "iopub.status.idle": "2022-11-05T02:52:53.491Z",
          "shell.execute_reply": "2022-11-05T02:52:53.554Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"Before Undersampling, counts of label '1': {}\".format(sum(y_train == 1)))\n",
        "print(\"Before Undersampling, counts of label '0': {} \\n\".format(sum(y_train == 0)))\n",
        "  \n",
        "# apply near miss\n",
        "from imblearn.under_sampling import NearMiss\n",
        "nr = NearMiss()\n",
        "  \n",
        "X_train_miss, y_train_miss = nr.fit_sample(X_train, y_train.ravel())\n",
        "  \n",
        "print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape))\n",
        "print('After Undersampling, the shape of train_y: {} \\n'.format(y_train_miss.shape))\n",
        "  \n",
        "print(\"After Undersampling, counts of label '1': {}\".format(sum(y_train_miss == 1)))\n",
        "print(\"After Undersampling, counts of label '0': {}\".format(sum(y_train_miss == 0)))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Before Undersampling, counts of label '1': 517\n",
            "Before Undersampling, counts of label '0': 540 \n",
            "\n",
            "After Undersampling, the shape of train_X: (1034, 10)\n",
            "After Undersampling, the shape of train_y: (1034,) \n",
            "\n",
            "After Undersampling, counts of label '1': 517\n",
            "After Undersampling, counts of label '0': 517\n"
          ]
        }
      ],
      "execution_count": 13,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:53.499Z",
          "iopub.execute_input": "2022-11-05T02:52:53.504Z",
          "iopub.status.idle": "2022-11-05T02:52:53.513Z",
          "shell.execute_reply": "2022-11-05T02:52:53.557Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# train the model on train set\n",
        "lr2 = LogisticRegression()\n",
        "lr2.fit(X_train_miss, y_train_miss.ravel())\n",
        "predictions = lr2.predict(X_test)\n",
        "  \n",
        "# print classification report\n",
        "print(classification_report(y_test, predictions))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.00      0.00      0.00       222\n",
            "           1       0.51      1.00      0.68       231\n",
            "\n",
            "    accuracy                           0.51       453\n",
            "   macro avg       0.25      0.50      0.34       453\n",
            "weighted avg       0.26      0.51      0.34       453\n",
            "\n"
          ]
        }
      ],
      "execution_count": 14,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false,
        "execution": {
          "iopub.status.busy": "2022-11-05T02:52:53.519Z",
          "iopub.execute_input": "2022-11-05T02:52:53.523Z",
          "iopub.status.idle": "2022-11-05T02:52:53.536Z",
          "shell.execute_reply": "2022-11-05T02:52:53.560Z"
        }
      }
    },
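    {
      "cell_type": "markdown",
      "source": [
        "## Conclusion\n",
        "\n",
        "In the run recorded above, the baseline model reaches 49% accuracy and predicts only class 0, the majority class of the training split. After SMOTE oversampling and after NearMiss undersampling, both models reach 51% accuracy but predict only class 1. Every classifier collapses to a single class (recall of 1.00 for one label and 0.00 for the other), so rebalancing the training data alone does not produce a useful model here."
      ],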
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "python3"
    },
    "kernelspec": {
      "name": "python3",
      "language": "python",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.13",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "nteract": {
      "version": "0.28.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}